import os
import numpy as np
import pandas as pd
from fast_ml import eda
import plotly.express as px
import plotly.io as pio
pio.renderers.default='notebook'
from sklearn.feature_selection import SelectKBest,f_classif
from sklearn.impute import SimpleImputer
import gc
import time
import warnings
warnings.filterwarnings("ignore")
from contextlib import contextmanager
@contextmanager
def timer(title):
    """Measure and report the wall-clock duration of the enclosed block.

    Usage: ``with timer("step"): ...`` — on normal exit, prints
    "<title> - done in <N>s" with the elapsed seconds rounded.
    """
    start = time.perf_counter()
    yield
    elapsed = time.perf_counter() - start
    print("{} - done in {:.0f}s".format(title, elapsed))
def read_data(csv_file_path):
    """Load a CSV into a DataFrame, print its shape and shrink its memory.

    Delegates the dtype downcasting to the module-level
    `reduce_mem_usage`, which modifies the frame in place.
    """
    frame = pd.read_csv(csv_file_path)
    print("Shape of the dataframe :" + str(frame.shape))
    reduce_mem_usage(frame)
    return frame
def reduce_mem_usage(df):
    """Downcast every column of *df* in place to reduce memory usage.

    1. Iterate over every column.
    2. Object columns are cast to 'category'; 'category' is left alone.
    3. For numeric columns, find the min and the max value.
    4. Apply the smallest integer (or float) dtype whose range contains
       both extremes.

    Fix vs the original: the bound checks are now inclusive (>= / <=),
    so a value sitting exactly on a dtype limit (e.g. 127 for int8) is
    still downcast; the strict '<'/'>' comparisons previously promoted
    such columns one size up (or, for int64 extremes, left them alone).

    NOTE: float columns may be reduced to float16, which keeps only ~3
    significant digits — acceptable for EDA, lossy for exact arithmetic.
    Prints memory usage before/after and the relative gain.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col in df.columns:
        col_type = df[col].dtype
        if (col_type != object) and (col_type != 'category'):
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                else:
                    # Anything that reaches here fits int64 by construction.
                    df[col] = df[col].astype(np.int64)
            else:
                # NaN-only columns compare False everywhere and fall
                # through to float64, same as the original behaviour.
                if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(
        100 * (start_mem - end_mem) / start_mem))
def show_infos(df):
    """Return a DataFrame summarising the missing values of each column.

    When a 'TARGET' column is present, the null counts are additionally
    broken down per target class.  In both cases the result now carries
    an overall '% nulls' column: the original only produced it in the
    no-TARGET branch, which made `list_emptiest_columns` raise a
    KeyError when called on a labelled frame.

    Returns the summary sorted by descending null count.
    """
    cols = df.columns
    null_values = df.isna().sum().values
    types = df.dtypes.values
    if 'TARGET' in cols:
        df_0 = df[df.TARGET == 0]
        df_1 = df[df.TARGET == 1]
        null_values_0 = df_0.isna().sum().values
        null_values_1 = df_1.isna().sum().values
        info_df = pd.DataFrame(
            {"name": cols, 'total_null': null_values,
             "nulls for target=0": null_values_0,
             "nulls for target=1": null_values_1})
        info_df['% nulls for target=0'] = round(100*info_df['nulls for target=0']/len(df_0), 2)
        info_df['% nulls for target=1'] = round(100*info_df['nulls for target=1']/len(df_1), 2)
        # Overall percentage, for parity with the no-TARGET branch.
        info_df['% nulls'] = round(100*info_df['total_null']/len(df), 2)
        info_df['type'] = types
        info_df = info_df.sort_values(by='total_null', ascending=False)
    else:
        info_df = pd.DataFrame(
            {"name": cols, "nulls": null_values})
        info_df['% nulls'] = round(100*info_df['nulls']/len(df), 2)
        info_df['type'] = types
        info_df = info_df.sort_values(by='nulls', ascending=False)
    return info_df
def show_category_details(df):
    """Print, for each categorical column, its value distribution
    (relative frequencies, NaN included as a value)."""
    cat_cols = df.select_dtypes('category').columns
    for name in cat_cols:
        print(name)
        distribution = df[name].value_counts(normalize=True, dropna=False)
        print(distribution)
def category_columns(df):
    """Return the list of column names of *df* whose dtype is 'category'.

    Fixes two bugs in the original: ``list.append`` returns None, so the
    rebinding ``categ_col = categ_col.append(column)`` crashed with an
    AttributeError on the second iteration, and the final value was
    never returned (the function always yielded None).
    """
    return list(df.select_dtypes('category').columns)
def list_emptiest_columns(df, threshold):
    """Names of the columns whose share of missing values exceeds
    *threshold* (a fraction in [0, 1]).  Counting is delegated to the
    module-level `show_infos`."""
    summary = show_infos(df)
    too_empty = summary['% nulls'] > threshold * 100
    return list(summary.loc[too_empty, 'name'].values)
def list_unfrequent_category_values(df, frequency):
    """Map each categorical column of *df* to the list of its values whose
    relative frequency is below *frequency* (NaN counted as a value).

    Fixes vs the original:
    - the proportions are read straight off the ``value_counts`` Series
      instead of wrapping it in a DataFrame and indexing by the column
      name, which raises a KeyError on pandas >= 2.0 (the result column
      is renamed 'proportion');
    - columns with exactly one rare value are reported too: the
      original's ``> 1`` test silently skipped them, inconsistent with
      `list_overfrequent_category_values` which uses ``> 0``.

    Prints each matching column name and returns the dict.
    """
    unfreq_categ_dict = {}
    for column in df.select_dtypes('category').columns:
        proportions = df[column].value_counts(normalize=True, dropna=False)
        unfreq_categ = proportions[proportions < frequency].index
        if len(unfreq_categ) > 0:
            print(column)
            unfreq_categ_dict[column] = list(unfreq_categ)
    return unfreq_categ_dict
def list_overfrequent_category_values(df, frequency):
    """Map each categorical column of *df* to the list of its values whose
    relative frequency exceeds *frequency* (NaN counted as a value).

    Fix vs the original: the proportions are read straight off the
    ``value_counts`` Series instead of wrapping it in a DataFrame and
    indexing by the column name, which raises a KeyError on
    pandas >= 2.0 (the result column is renamed 'proportion').

    Prints each matching column name and returns the dict.
    """
    freq_categ_dict = {}
    for column in df.select_dtypes('category').columns:
        proportions = df[column].value_counts(normalize=True, dropna=False)
        freq_categ = proportions[proportions > frequency].index
        if len(freq_categ) > 0:
            print(column)
            freq_categ_dict[column] = list(freq_categ)
    return freq_categ_dict
def list_binary_cat_cols(df):
    """Categorical columns (other than TARGET) taking exactly two distinct
    values; NaN counts as a value."""
    found = []
    for name in df:
        if name == 'TARGET':
            continue
        if df[name].dtypes == 'category' and len(df[name].unique()) == 2:
            found.append(name)
    return found
def list_binary_cols(df):
    """Non-categorical columns (other than TARGET) taking exactly two
    distinct values; NaN counts as a value."""
    found = []
    for name in df:
        if name == 'TARGET' or df[name].dtypes == 'category':
            continue
        if len(df[name].unique()) == 2:
            found.append(name)
    return found
def get_numerical_non_binary_columns(df):
    """Names of the int/float columns of *df*, excluding binary columns
    (as detected by `list_binary_cols`) and the SK_ID_CURR identifier.

    Float columns come first, then int columns, preserving frame order
    within each group.
    """
    excluded = set(list_binary_cols(df) + ['SK_ID_CURR'])
    float_columns = [c for c in df if str(df[c].dtypes).startswith('float')]
    int_columns = [c for c in df if str(df[c].dtypes).startswith('int')]
    return [c for c in float_columns + int_columns if c not in excluded]
def one_hot_encoder(df, nan_as_category=True):
    """One-hot encode every 'category' column of *df*.

    Returns the encoded frame together with the list of columns the
    encoding added; NaN gets its own indicator column when
    *nan_as_category* is true.
    """
    before = list(df.columns)
    cat_cols = [c for c in df.columns if df[c].dtype == 'category']
    encoded = pd.get_dummies(df, columns=cat_cols, dummy_na=nan_as_category)
    added = [c for c in encoded.columns if c not in before]
    return encoded, added
def get_percentiles(data, feature_name):
    """Deciles (0th..100th percentile) of *feature_name* per TARGET class.

    Returns a long-format frame with columns Percentile / TARGET /
    Valeur, two rows per decile (class '0' then class '1'), values
    rounded to 3 decimals.  Missing feature values are dropped before
    the percentiles are computed.

    Improvement vs the original: rows are collected in a list and
    concatenated once, instead of calling ``pd.concat`` inside the loop
    (quadratic copying).
    """
    target_0_data = data[data['TARGET'] == 0][feature_name].dropna()
    target_1_data = data[data['TARGET'] == 1][feature_name].dropna()
    rows = []
    for i in range(0, 101, 10):
        rows.append(pd.Series({'Percentile': i, 'TARGET': '0', 'Valeur': np.round(
            np.percentile(target_0_data, i), 3)}).to_frame().T)
        rows.append(pd.Series({'Percentile': i, 'TARGET': '1', 'Valeur': np.round(
            np.percentile(target_1_data, i), 3)}).to_frame().T)
    df = pd.DataFrame(columns=["Percentile", "TARGET", 'Valeur'])
    df = pd.concat([df] + rows, ignore_index=True)
    return df
def get_features_scores(df, features_to_check, test_func):
    """Score each feature of *features_to_check* against TARGET.

    For every feature, rows where the feature is missing are dropped,
    then ``test_func(X, y)`` is called and the first score / p-value of
    its result is collected.  When *test_func* is sklearn's
    ``f_classif``, plain ndarrays are passed instead of DataFrames.
    Returns a frame (feature, score, p_value) sorted by decreasing
    score.

    Fix vs the original: rows are accumulated in a plain list —
    ``DataFrame.append`` was deprecated in pandas 1.4 and removed in
    pandas 2.0.
    """
    rows = []
    for col in features_to_check:
        subdf = df[[col, 'TARGET']].dropna()
        X = subdf[[col]]
        y = subdf[['TARGET']]
        if test_func == f_classif:
            X = subdf[[col]].values.reshape(-1, 1)
            y = subdf[['TARGET']].values.reshape(-1, 1)
        score, p_value = test_func(X, y)
        rows.append({'feature': col, 'score': score[0], 'p_value': p_value[0]})
    fkbest_df = pd.DataFrame(rows, columns=['feature', 'score', 'p_value'])
    return fkbest_df.sort_values(by='score', ascending=False)
# Location of the raw Home Credit CSV files.
data_dir = '../../data/'
# Main training table; read_data also downcasts dtypes to save memory.
application = read_data(data_dir+'application_train.csv')
Shape of the dataframe :(307511, 122) Memory usage of dataframe is 286.23 MB Memory usage after optimization is: 59.54 MB Decreased by 79.2%
Le jeu de données est pour le moins déséquilibré.
# Class balance of TARGET: count per class, displayed as a pie chart.
distrib_target = pd.DataFrame(application.TARGET.value_counts()).reset_index(
).rename(columns={'index': 'target', 'TARGET': 'count'})
fig = px.pie(distrib_target, values='count', names='target')
fig.update_layout(title = 'Distribution des négatifs et des positifs', height=400, width=400)
fig.show()
# Quick EDA summary (dtype, cardinality, missing counts) of the
# application table, sorted by number of distinct values.
summary_df = eda.df_info(application)
summary_df.sort_values(by='num_unique_values', ascending=True)
| data_type | data_type_grp | num_unique_values | sample_unique_values | num_missing | perc_missing | |
|---|---|---|---|---|---|---|
| LIVE_CITY_NOT_WORK_CITY | int8 | Numerical | 2 | [0, 1] | 0 | 0.0 |
| FLAG_DOCUMENT_18 | int8 | Numerical | 2 | [0, 1] | 0 | 0.0 |
| FLAG_DOCUMENT_14 | int8 | Numerical | 2 | [0, 1] | 0 | 0.0 |
| FLAG_MOBIL | int8 | Numerical | 2 | [1, 0] | 0 | 0.0 |
| FLAG_EMP_PHONE | int8 | Numerical | 2 | [1, 0] | 0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... |
| EXT_SOURCE_2 | float16 | Numerical | 6888 | [0.262939453125, 0.6220703125, 0.55615234375, ... | 660 | 0.214626 |
| DAYS_EMPLOYED | int32 | Numerical | 12574 | [-637, -1188, -225, -3039, -3038, -1588, -3130... | 0 | 0.0 |
| AMT_ANNUITY | float32 | Numerical | 13672 | [24700.5, 35698.5, 6750.0, 29686.5, 21865.5, 2... | 12 | 0.003902 |
| DAYS_BIRTH | int16 | Numerical | 17460 | [-9461, -16765, -19046, -19005, -19932, -16941... | 0 | 0.0 |
| SK_ID_CURR | int32 | Numerical | 307511 | [100002, 100003, 100004, 100006, 100007, 10000... | 0 | 0.0 |
122 rows × 6 columns
Extraction des informations sur le lieu de vie du client
# Columns describing the client's building/environment statistics are
# recognisable by their *_MODE, *_MEDI and *AVG suffixes.
mod_medi_avg_col_list = [col for col in application.columns if (
    col.endswith('_MODE') or col.endswith('_MEDI') or col.endswith('AVG'))]
total_list = mod_medi_avg_col_list
# Inspect these columns together with the target.
env_application = application[total_list + ['TARGET']]
del mod_medi_avg_col_list,
gc.collect()
# Missing-value profile of the environment columns, emptiest first.
summary_df = eda.df_info(env_application)
summary_df.sort_values(by='perc_missing', ascending=False)
| data_type | data_type_grp | num_unique_values | sample_unique_values | num_missing | perc_missing | |
|---|---|---|---|---|---|---|
| COMMONAREA_MEDI | float16 | Numerical | 2849 | [0.01439666748046875, 0.060791015625, nan, 0.0... | 214865 | 69.872297 |
| COMMONAREA_AVG | float16 | Numerical | 2841 | [0.0142974853515625, 0.06048583984375, nan, 0.... | 214865 | 69.872297 |
| COMMONAREA_MODE | float16 | Numerical | 2808 | [0.01439666748046875, 0.049713134765625, nan, ... | 214865 | 69.872297 |
| NONLIVINGAPARTMENTS_MODE | float16 | Numerical | 167 | [0.0, nan, 0.019500732421875, 0.00780105590820... | 213514 | 69.432963 |
| NONLIVINGAPARTMENTS_AVG | float16 | Numerical | 386 | [0.0, 0.0039005279541015625, nan, 0.0193023681... | 213514 | 69.432963 |
| NONLIVINGAPARTMENTS_MEDI | float16 | Numerical | 214 | [0.0, 0.0039005279541015625, nan, 0.0193939208... | 213514 | 69.432963 |
| FONDKAPREMONT_MODE | category | category | 4 | [reg oper account, nan, org spec account, reg ... | 210295 | 68.386172 |
| LIVINGAPARTMENTS_MODE | float16 | Numerical | 736 | [0.022003173828125, 0.0789794921875, nan, 0.13... | 210199 | 68.354953 |
| LIVINGAPARTMENTS_AVG | float16 | Numerical | 1840 | [0.02020263671875, 0.0772705078125, nan, 0.120... | 210199 | 68.354953 |
| LIVINGAPARTMENTS_MEDI | float16 | Numerical | 1097 | [0.0204925537109375, 0.07867431640625, nan, 0.... | 210199 | 68.354953 |
| FLOORSMIN_MODE | float16 | Numerical | 25 | [0.125, 0.333251953125, nan, 0.375, 0.70849609... | 208642 | 67.84863 |
| FLOORSMIN_MEDI | float16 | Numerical | 47 | [0.125, 0.333251953125, nan, 0.375, 0.70849609... | 208642 | 67.84863 |
| FLOORSMIN_AVG | float16 | Numerical | 304 | [0.125, 0.333251953125, nan, 0.375, 0.70849609... | 208642 | 67.84863 |
| YEARS_BUILD_MEDI | float16 | Numerical | 151 | [0.62451171875, 0.798828125, nan, 0.73828125, ... | 204488 | 66.497784 |
| YEARS_BUILD_MODE | float16 | Numerical | 154 | [0.63427734375, 0.80419921875, nan, 0.74511718... | 204488 | 66.497784 |
| YEARS_BUILD_AVG | float16 | Numerical | 149 | [0.619140625, 0.7958984375, nan, 0.73486328125... | 204488 | 66.497784 |
| LANDAREA_MODE | float16 | Numerical | 3031 | [0.037689208984375, 0.0128021240234375, nan, 0... | 182590 | 59.376738 |
| LANDAREA_AVG | float16 | Numerical | 3011 | [0.036895751953125, 0.01300048828125, nan, 0.0... | 182590 | 59.376738 |
| LANDAREA_MEDI | float16 | Numerical | 3041 | [0.037506103515625, 0.0131988525390625, nan, 0... | 182590 | 59.376738 |
| BASEMENTAREA_MODE | float16 | Numerical | 3148 | [0.038299560546875, 0.053802490234375, nan, 0.... | 179943 | 58.515956 |
| BASEMENTAREA_MEDI | float16 | Numerical | 3121 | [0.036895751953125, 0.052886962890625, nan, 0.... | 179943 | 58.515956 |
| BASEMENTAREA_AVG | float16 | Numerical | 3127 | [0.036895751953125, 0.052886962890625, nan, 0.... | 179943 | 58.515956 |
| NONLIVINGAREA_AVG | float16 | Numerical | 2885 | [0.0, 0.00980377197265625, nan, 0.10009765625,... | 169682 | 55.179164 |
| NONLIVINGAREA_MEDI | float16 | Numerical | 2923 | [0.0, 0.01000213623046875, nan, 0.102172851562... | 169682 | 55.179164 |
| NONLIVINGAREA_MODE | float16 | Numerical | 2924 | [0.0, nan, 0.10601806640625, 0.00469970703125,... | 169682 | 55.179164 |
| ELEVATORS_AVG | float16 | Numerical | 257 | [0.0, 0.08001708984375, nan, 0.1600341796875, ... | 163891 | 53.29598 |
| ELEVATORS_MEDI | float16 | Numerical | 46 | [0.0, 0.08001708984375, nan, 0.1600341796875, ... | 163891 | 53.29598 |
| ELEVATORS_MODE | float16 | Numerical | 26 | [0.0, 0.08062744140625, nan, 0.1611328125, 0.4... | 163891 | 53.29598 |
| WALLSMATERIAL_MODE | category | category | 7 | [Stone, brick, Block, nan, Panel, Mixed, Woode... | 156341 | 50.840783 |
| APARTMENTS_AVG | float16 | Numerical | 2251 | [0.0247039794921875, 0.09588623046875, nan, 0.... | 156061 | 50.749729 |
| APARTMENTS_MODE | float16 | Numerical | 760 | [0.02520751953125, 0.0924072265625, nan, 0.083... | 156061 | 50.749729 |
| APARTMENTS_MEDI | float16 | Numerical | 1148 | [0.024993896484375, 0.0968017578125, nan, 0.08... | 156061 | 50.749729 |
| ENTRANCES_MEDI | float16 | Numerical | 46 | [0.0689697265625, 0.03448486328125, nan, 0.206... | 154828 | 50.348768 |
| ENTRANCES_MODE | float16 | Numerical | 30 | [0.0689697265625, 0.03448486328125, nan, 0.206... | 154828 | 50.348768 |
| ENTRANCES_AVG | float16 | Numerical | 285 | [0.0689697265625, 0.03448486328125, nan, 0.206... | 154828 | 50.348768 |
| LIVINGAREA_MEDI | float16 | Numerical | 3741 | [0.0193023681640625, 0.0557861328125, nan, 0.0... | 154350 | 50.193326 |
| LIVINGAREA_MODE | float16 | Numerical | 3746 | [0.019805908203125, 0.055389404296875, nan, 0.... | 154350 | 50.193326 |
| LIVINGAREA_AVG | float16 | Numerical | 3715 | [0.0189971923828125, 0.054901123046875, nan, 0... | 154350 | 50.193326 |
| HOUSETYPE_MODE | category | category | 3 | [block of flats, nan, terraced house, specific... | 154297 | 50.176091 |
| FLOORSMAX_MODE | float16 | Numerical | 25 | [0.08331298828125, 0.291748046875, nan, 0.1667... | 153020 | 49.760822 |
| FLOORSMAX_MEDI | float16 | Numerical | 49 | [0.08331298828125, 0.291748046875, nan, 0.1667... | 153020 | 49.760822 |
| FLOORSMAX_AVG | float16 | Numerical | 403 | [0.08331298828125, 0.291748046875, nan, 0.1667... | 153020 | 49.760822 |
| YEARS_BEGINEXPLUATATION_AVG | float16 | Numerical | 284 | [0.97216796875, 0.98486328125, nan, 0.98095703... | 150007 | 48.781019 |
| YEARS_BEGINEXPLUATATION_MEDI | float16 | Numerical | 244 | [0.97216796875, 0.98486328125, nan, 0.98095703... | 150007 | 48.781019 |
| YEARS_BEGINEXPLUATATION_MODE | float16 | Numerical | 221 | [0.97216796875, 0.98486328125, nan, 0.98095703... | 150007 | 48.781019 |
| TOTALAREA_MODE | float16 | Numerical | 3660 | [0.01490020751953125, 0.0714111328125, nan, 0.... | 148431 | 48.268517 |
| EMERGENCYSTATE_MODE | category | category | 2 | [No, nan, Yes] | 145755 | 47.398304 |
| TARGET | int8 | Numerical | 2 | [1, 0] | 0 | 0.0 |
Les informations sur l'environnement où vit le client sont d'une façon générale peu renseignées, et sont soit redondantes avec des informations plus pertinentes sur les revenus du client, soit non porteuses d'information.
# The environment columns are sparsely filled and judged redundant or
# uninformative; drop them from the main table.
application = application.drop(columns=total_list)
del total_list
gc.collect()
0
Dans l'optique de faire en sorte que les dimensions ne soient pas trop grandes lors de l'encoding, je traite ici les variables qualitatives. Je considère comme non-discriminantes des variables qualitatives ayant une valeur sur-représentée pour les deux targets. Par ailleurs, je vais regrouper sous la même appellation les valeurs les moins fréquentes.
# Survey the qualitative columns: binary ones, dominant modalities per
# target class (threshold 90%), and rare modalities (under 1%).
print ("Liste des colonnes à valeurs catégorielles binaires")
print(list_binary_cat_cols(application))
print ("Valeurs les plus fréquentes pour les variables qualitatives pour Target = 0")
print(list_overfrequent_category_values(application[application['TARGET']==0],0.9))
print ("Valeurs les plus fréquentes pour les variables qualitatives pour Target = 1")
print(list_overfrequent_category_values(application[application['TARGET']==1],0.9))
print ("Valeurs les moins fréquentes pour les variables qualitatives")
print(list_unfrequent_category_values(application,0.01))
Liste des colonnes à valeurs catégorielles binaires
['NAME_CONTRACT_TYPE', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']
Valeurs les plus fréquentes pour les variables qualitatives pour Target = 0
NAME_CONTRACT_TYPE
{'NAME_CONTRACT_TYPE': ['Cash loans']}
Valeurs les plus fréquentes pour les variables qualitatives pour Target = 1
NAME_CONTRACT_TYPE
{'NAME_CONTRACT_TYPE': ['Cash loans']}
Valeurs les moins fréquentes pour les variables qualitatives
NAME_TYPE_SUITE
NAME_INCOME_TYPE
NAME_HOUSING_TYPE
OCCUPATION_TYPE
ORGANIZATION_TYPE
{'NAME_TYPE_SUITE': ['Other_B', nan, 'Other_A', 'Group of people'], 'NAME_INCOME_TYPE': ['Unemployed', 'Student', 'Businessman', 'Maternity leave'], 'NAME_HOUSING_TYPE': ['Office apartment', 'Co-op apartment'], 'OCCUPATION_TYPE': ['Private service staff', 'Low-skill Laborers', 'Waiters/barmen staff', 'Secretaries', 'Realty agents', 'HR staff', 'IT staff'], 'ORGANIZATION_TYPE': ['Housing', 'Industry: type 11', 'Military', 'Bank', 'Agriculture', 'Police', 'Transport: type 2', 'Postal', 'Security Ministries', 'Trade: type 2', 'Restaurant', 'Services', 'University', 'Industry: type 7', 'Transport: type 3', 'Industry: type 1', 'Hotel', 'Electricity', 'Industry: type 4', 'Trade: type 6', 'Industry: type 5', 'Insurance', 'Telecom', 'Emergency', 'Industry: type 2', 'Advertising', 'Realtor', 'Culture', 'Industry: type 12', 'Trade: type 1', 'Mobile', 'Legal Services', 'Cleaning', 'Transport: type 1', 'Industry: type 6', 'Industry: type 10', 'Religion', 'Industry: type 13', 'Trade: type 4', 'Trade: type 5', 'Industry: type 8']}
Définition d'une fonction destinée à "condenser" les variables qualitatives en diminuant le nombre de modes.
def condense_category_app(df):
    """Collapse the rare modalities of the application's qualitative
    columns into a single 'Unfrequent' value.

    Mutates *df* in place and returns it.  Also prints the sample count.
    """
    print("Application samples: {}".format(len(df)))
    rare_values = {
        'NAME_TYPE_SUITE': ['Other_B', 'Other_A', 'Group of people'],
        'NAME_INCOME_TYPE': ['Unemployed', 'Student', 'Businessman', 'Maternity leave'],
        'OCCUPATION_TYPE': ['Private service staff', 'Low-skill Laborers',
                            'Waiters/barmen staff', 'Secretaries',
                            'Realty agents', 'HR staff', 'IT staff'],
        'ORGANIZATION_TYPE': ['Housing', 'Industry: type 11', 'Military', 'Bank',
                              'Agriculture', 'Police', 'Transport: type 2', 'Postal',
                              'Security Ministries', 'Trade: type 2', 'Restaurant',
                              'Services', 'University', 'Industry: type 7',
                              'Transport: type 3', 'Industry: type 1', 'Hotel',
                              'Electricity', 'Industry: type 4', 'Trade: type 6',
                              'Industry: type 5', 'Insurance', 'Telecom', 'Emergency',
                              'Industry: type 2', 'Advertising', 'Realtor', 'Culture',
                              'Industry: type 12', 'Trade: type 1', 'Mobile',
                              'Legal Services', 'Cleaning', 'Transport: type 1',
                              'Industry: type 6', 'Industry: type 10', 'Religion',
                              'Industry: type 13', 'Trade: type 4', 'Trade: type 5',
                              'Industry: type 8'],
    }
    for column, values in rare_values.items():
        df[column] = df[column].replace(values, 'Unfrequent')
    return df
show_infos(application[application['FLAG_OWN_CAR'] == 'Y'][['OWN_CAR_AGE','TARGET']])
| name | total_null | nulls for target=0 | nulls for target=1 | % nulls for target=0 | % nulls for target=1 | type | |
|---|---|---|---|---|---|---|---|
| 0 | OWN_CAR_AGE | 5 | 5 | 0 | 0.01 | 0.0 | float16 |
| 1 | TARGET | 0 | 0 | 0 | 0.00 | 0.0 | int8 |
def fe_application(df, nan_as_category=True):
    """Feature-engineer the application table and return the encoded frame.

    Steps: drop columns judged non-relevant (notably CODE_GENDER),
    binary-encode the Y/N flags, condense rare categorical modalities,
    add AGE and several ratio features, neutralise the DAYS_EMPLOYED
    sentinel, then one-hot encode the remaining categorical columns.
    """
    # Columns judged non-relevant, in particular CODE_GENDER.
    df = df.drop(columns=['NAME_CONTRACT_TYPE','CODE_GENDER','DAYS_REGISTRATION','DAYS_LAST_PHONE_CHANGE','DAYS_ID_PUBLISH',
                          'WEEKDAY_APPR_PROCESS_START','HOUR_APPR_PROCESS_START','EXT_SOURCE_1','AMT_REQ_CREDIT_BUREAU_HOUR',
                          'AMT_REQ_CREDIT_BUREAU_DAY','AMT_REQ_CREDIT_BUREAU_WEEK','OBS_30_CNT_SOCIAL_CIRCLE','OBS_60_CNT_SOCIAL_CIRCLE'])
    # Categorical features with binary encode (0 or 1; two categories).
    for bin_feature in ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        df[bin_feature], uniques = pd.factorize(df[bin_feature])
    df = condense_category_app(df)
    # Age in years (DAYS_BIRTH is a negative day offset).
    df['AGE'] = round(-df.DAYS_BIRTH/365)
    # Neutralise the 365243-day placeholder of DAYS_EMPLOYED.
    # Fix: Series.replace(..., inplace=True) on a column selection is
    # deprecated in pandas 2.x and unreliable under copy-on-write;
    # assign the replaced column back instead.
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(3.652430e+05, np.nan)
    # Some simple new features (percentages).
    df['DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
    df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
    df['ANNUITY_INCOME_PERC'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
    df = df.drop(columns=['DAYS_BIRTH'])
    # One-hot encode the remaining categorical columns.
    df, _ = one_hot_encoder(df, nan_as_category)
    gc.collect()
    return df
application = fe_application(application)
Application samples: 307511
show_infos(application)
| name | total_null | nulls for target=0 | nulls for target=1 | % nulls for target=0 | % nulls for target=1 | type | |
|---|---|---|---|---|---|---|---|
| 11 | OWN_CAR_AGE | 202929 | 185680 | 17249 | 65.68 | 69.48 | float16 |
| 28 | EXT_SOURCE_3 | 60965 | 55288 | 5677 | 19.56 | 22.87 | float16 |
| 55 | DAYS_EMPLOYED_PERC | 55374 | 52384 | 2990 | 18.53 | 12.04 | float64 |
| 10 | DAYS_EMPLOYED | 55374 | 52384 | 2990 | 18.53 | 12.04 | float64 |
| 51 | AMT_REQ_CREDIT_BUREAU_MON | 41519 | 37227 | 4292 | 13.17 | 17.29 | float16 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 41 | FLAG_DOCUMENT_12 | 0 | 0 | 0 | 0.00 | 0.00 | int8 |
| 40 | FLAG_DOCUMENT_11 | 0 | 0 | 0 | 0.00 | 0.00 | int8 |
| 39 | FLAG_DOCUMENT_10 | 0 | 0 | 0 | 0.00 | 0.00 | int8 |
| 38 | FLAG_DOCUMENT_9 | 0 | 0 | 0 | 0.00 | 0.00 | int8 |
| 123 | ORGANIZATION_TYPE_nan | 0 | 0 | 0 | 0.00 | 0.00 | uint8 |
124 rows × 7 columns
# Columns still containing missing values, most-missing first.
summary_df = eda.df_info(application)
summary_df[summary_df['num_missing'] > 0].sort_values(by='perc_missing', ascending=False)
| data_type | data_type_grp | num_unique_values | sample_unique_values | num_missing | perc_missing | |
|---|---|---|---|---|---|---|
| OWN_CAR_AGE | float16 | Numerical | 62 | [nan, 26.0, 17.0, 8.0, 23.0, 7.0, 14.0, 1.0, 3... | 202929 | 65.99081 |
| EXT_SOURCE_3 | float16 | Numerical | 814 | [0.139404296875, nan, 0.7294921875, 0.62109375... | 60965 | 19.825307 |
| DAYS_EMPLOYED | float64 | Numerical | 12573 | [-637.0, -1188.0, -225.0, -3039.0, -3038.0, -1... | 55374 | 18.007161 |
| DAYS_EMPLOYED_PERC | float64 | Numerical | 249434 | [0.0673290349857309, 0.07086191470325082, 0.01... | 55374 | 18.007161 |
| AMT_REQ_CREDIT_BUREAU_MON | float16 | Numerical | 24 | [0.0, nan, 1.0, 2.0, 6.0, 5.0, 3.0, 7.0, 9.0, ... | 41519 | 13.501631 |
| AMT_REQ_CREDIT_BUREAU_QRT | float16 | Numerical | 11 | [0.0, nan, 1.0, 2.0, 4.0, 3.0, 8.0, 5.0, 6.0, ... | 41519 | 13.501631 |
| AMT_REQ_CREDIT_BUREAU_YEAR | float16 | Numerical | 25 | [1.0, 0.0, nan, 2.0, 4.0, 5.0, 3.0, 8.0, 6.0, ... | 41519 | 13.501631 |
| DEF_30_CNT_SOCIAL_CIRCLE | float16 | Numerical | 10 | [2.0, 0.0, 1.0, nan, 3.0, 4.0, 5.0, 6.0, 7.0, ... | 1021 | 0.332021 |
| DEF_60_CNT_SOCIAL_CIRCLE | float16 | Numerical | 9 | [2.0, 0.0, 1.0, nan, 3.0, 5.0, 4.0, 7.0, 24.0,... | 1021 | 0.332021 |
| EXT_SOURCE_2 | float16 | Numerical | 6888 | [0.262939453125, 0.6220703125, 0.55615234375, ... | 660 | 0.214626 |
| AMT_GOODS_PRICE | float32 | Numerical | 1002 | [351000.0, 1129500.0, 135000.0, 297000.0, 5130... | 278 | 0.090403 |
| AMT_ANNUITY | float32 | Numerical | 13672 | [24700.5, 35698.5, 6750.0, 29686.5, 21865.5, 2... | 12 | 0.003902 |
| ANNUITY_INCOME_PERC | float32 | Numerical | 88804 | [0.1219777762889862, 0.1322166621685028, 0.100... | 12 | 0.003902 |
| PAYMENT_RATE | float32 | Numerical | 39101 | [0.06074926629662514, 0.027598323300480843, 0.... | 12 | 0.003902 |
| CNT_FAM_MEMBERS | float16 | Numerical | 17 | [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 9.0, 7.0, 8.0, ... | 2 | 0.00065 |
| INCOME_PER_PERSON | float32 | Numerical | 3064 | [202500.0, 135000.0, 67500.0, 121500.0, 49500.... | 2 | 0.00065 |
J'ai choisi de faire une imputation des valeurs vides par la médiane
# Median imputation of the remaining missing values.  SimpleImputer
# returns a bare ndarray, so the DataFrame is rebuilt with the original
# column names; the output is float everywhere, hence a fresh downcast.
columns = application.columns
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
values = imp_median.fit_transform(application)
application = pd.DataFrame(values, columns=columns)
reduce_mem_usage(application)
Memory usage of dataframe is 290.92 MB Memory usage after optimization is: 76.25 MB Decreased by 73.8%
def remove_absent_id_curr(df):
    """Keep only the rows of *df* whose SK_ID_CURR appears in the global
    `application` frame; print how many rows were dropped.

    Bug fix: the original filtered into a local variable and returned
    nothing, so the filtered frame was silently lost — the result is
    now returned to the caller.
    """
    initial_len = len(df)
    df = df[df.SK_ID_CURR.isin(application.SK_ID_CURR)]
    print('%s rows have been removed' %(initial_len-len(df)))
    return df
bureau_balance = read_data(data_dir+'bureau_balance.csv')
Shape of the dataframe :(27299925, 3) Memory usage of dataframe is 624.85 MB Memory usage after optimization is: 156.21 MB Decreased by 75.0%
eda.df_info(bureau_balance)
| data_type | data_type_grp | num_unique_values | sample_unique_values | num_missing | perc_missing | |
|---|---|---|---|---|---|---|
| SK_ID_BUREAU | int32 | Numerical | 817395 | [5715448, 5715449, 5715451, 5715452, 5715453, ... | 0 | 0.0 |
| MONTHS_BALANCE | int8 | Numerical | 97 | [0, -1, -2, -3, -4, -5, -6, -7, -8, -9] | 0 | 0.0 |
| STATUS | category | category | 8 | [C, 0, X, 1, 2, 3, 5, 4] | 0 | 0.0 |
# Same qualitative survey for bureau_balance (no target split here).
print ("Liste des colonnes à valeurs catégorielles binaires")
print(list_binary_cat_cols(bureau_balance))
print ("Valeurs les plus fréquentes pour les variables qualitatives")
print(list_overfrequent_category_values(bureau_balance,0.9))
print ("Valeurs les moins fréquentes pour les variables qualitatives")
print(list_unfrequent_category_values(bureau_balance,0.01))
Liste des colonnes à valeurs catégorielles binaires
[]
Valeurs les plus fréquentes pour les variables qualitatives
{}
Valeurs les moins fréquentes pour les variables qualitatives
STATUS
{'STATUS': ['1', '5', '2', '3', '4']}
# Credit-bureau records, restricted to clients present in application.
bureau = read_data(data_dir+'bureau.csv')
initial_len = len(bureau)
bureau = bureau[bureau.SK_ID_CURR.isin(application.SK_ID_CURR)]
print('%s rows have been removed' %(initial_len-len(bureau)))
Shape of the dataframe :(1716428, 17) Memory usage of dataframe is 222.62 MB Memory usage after optimization is: 78.57 MB Decreased by 64.7% 251103 rows have been removed
eda.df_info(bureau).sort_values(by='num_unique_values', ascending=True)
| data_type | data_type_grp | num_unique_values | sample_unique_values | num_missing | perc_missing | |
|---|---|---|---|---|---|---|
| CREDIT_ACTIVE | category | category | 4 | [Closed, Active, Sold, Bad debt] | 0 | 0.0 |
| CREDIT_CURRENCY | category | category | 4 | [currency 1, currency 2, currency 4, currency 3] | 0 | 0.0 |
| CNT_CREDIT_PROLONG | int8 | Numerical | 10 | [0, 2, 1, 4, 3, 5, 9, 8, 6, 7] | 0 | 0.0 |
| CREDIT_TYPE | category | category | 15 | [Consumer credit, Credit card, Mortgage, Car l... | 0 | 0.0 |
| CREDIT_DAY_OVERDUE | int16 | Numerical | 893 | [0, 2603, 6, 30, 2156, 496, 186, 2264, 41, 8] | 0 | 0.0 |
| AMT_CREDIT_SUM_OVERDUE | float32 | Numerical | 1440 | [0.0, 231.52499389648438, 288.0, 58.5, 504.0, ... | 0 | 0.0 |
| DAYS_ENDDATE_FACT | float16 | Numerical | 2486 | [-153.0, nan, -1710.0, -840.0, -825.0, -187.0,... | 544673 | 37.170798 |
| DAYS_CREDIT | int16 | Numerical | 2923 | [-497, -208, -203, -629, -273, -43, -1896, -11... | 0 | 0.0 |
| DAYS_CREDIT_UPDATE | int32 | Numerical | 2980 | [-131, -20, -16, -21, -31, -22, -1710, -840, -... | 0 | 0.0 |
| DAYS_CREDIT_ENDDATE | float16 | Numerical | 6999 | [-153.0, 1075.0, 528.0, nan, 1197.0, 27456.0, ... | 89098 | 6.080426 |
| AMT_ANNUITY | float32 | Numerical | 30103 | [nan, 0.0, 2691.0, 24462.0, 8181.0, 8061.20996... | 1130013 | 77.116885 |
| AMT_CREDIT_SUM_LIMIT | float32 | Numerical | 45290 | [nan, 108982.6171875, 0.0, 228320.09375, 411.6... | 489670 | 33.41716 |
| AMT_CREDIT_MAX_OVERDUE | float32 | Numerical | 61066 | [nan, 77674.5, 0.0, 14985.0, 310.5, 20493.2695... | 948545 | 64.732738 |
| AMT_CREDIT_SUM_DEBT | float32 | Numerical | 202909 | [0.0, 171342.0, nan, 71017.3828125, 42103.8007... | 223094 | 15.224882 |
| AMT_CREDIT_SUM | float32 | Numerical | 211513 | [91323.0, 225000.0, 464323.5, 90000.0, 2700000... | 3 | 0.000205 |
| SK_ID_CURR | int32 | Numerical | 263491 | [215354, 162297, 402440, 238881, 222183, 42615... | 0 | 0.0 |
| SK_ID_BUREAU | int32 | Numerical | 1465325 | [5714462, 5714463, 5714464, 5714465, 5714466, ... | 0 | 0.0 |
# Same qualitative survey for bureau.
print ("Liste des colonnes à valeurs catégorielles binaires")
print(list_binary_cat_cols(bureau))
print ("Valeurs les plus fréquentes pour les variables qualitatives")
print(list_overfrequent_category_values(bureau,0.9))
print ("Valeurs les moins fréquentes pour les variables qualitatives")
print(list_unfrequent_category_values(bureau,0.01))
Liste des colonnes à valeurs catégorielles binaires
[]
Valeurs les plus fréquentes pour les variables qualitatives
CREDIT_CURRENCY
{'CREDIT_CURRENCY': ['currency 1']}
Valeurs les moins fréquentes pour les variables qualitatives
CREDIT_ACTIVE
CREDIT_CURRENCY
CREDIT_TYPE
{'CREDIT_ACTIVE': ['Sold', 'Bad debt'], 'CREDIT_CURRENCY': ['currency 2', 'currency 3', 'currency 4'], 'CREDIT_TYPE': ['Microloan', 'Loan for business development', 'Another type of loan', 'Unknown type of loan', 'Loan for working capital replenishment', 'Cash loan (non-earmarked)', 'Real estate loan', 'Loan for the purchase of equipment', 'Loan for purchase of shares (margin lending)', 'Interbank credit', 'Mobile operator loan']}
def fe_bureau_and_balance(bureau, bb, nan_as_category=True):
    """Aggregate bureau.csv and bureau_balance.csv to one row per SK_ID_CURR.

    NOTE: both input frames are modified in place (sign flip on
    MONTHS_BALANCE, modality regrouping, CREDIT_CURRENCY dropped)
    before encoding.  Returns a frame of BUREAU_* statistics joined
    with ACTIVE_* and CLOSED_* statistics computed on the corresponding
    credit subsets.
    """
    # MONTHS_BALANCE is stored as a negative offset; flip it positive.
    bb['MONTHS_BALANCE'] = -bb['MONTHS_BALANCE']
    # Merge the rare delinquency statuses (1..5) into a single modality.
    bb['STATUS'] = bb['STATUS'].replace(['1','5', '2', '3', '4'], '1_5')
    # Collapse the rare credit types into one 'Unfrequent' bucket.
    bureau['CREDIT_TYPE'] = bureau['CREDIT_TYPE'].replace(['Loan for business development',
                                                           'Another type of loan',
                                                           'Unknown type of loan',
                                                           'Loan for working capital replenishment',
                                                           'Cash loan (non-earmarked)',
                                                           'Real estate loan',
                                                           'Loan for the purchase of equipment',
                                                           'Loan for purchase of shares (margin lending)',
                                                           'Interbank credit',
                                                           'Mobile operator loan'], 'Unfrequent')
    # Nearly all rows share 'currency 1'; the column carries no signal.
    bureau.drop(columns=['CREDIT_CURRENCY'], inplace=True)
    bb, bb_cat = one_hot_encoder(bb, nan_as_category)
    # Per-credit (SK_ID_BUREAU) aggregation of the monthly history.
    bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}
    for col in bb_cat:
        bb_aggregations[col] = ['mean']
    bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
    bb_agg.columns = pd.Index([e[0] + "_" + e[1].upper()
                               for e in bb_agg.columns.tolist()])
    # One-hot encode the categorical columns of bureau.
    bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)
    # Bureau balance: merge with bureau.csv
    bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
    bureau.drop(['SK_ID_BUREAU'], axis=1, inplace=True)
    del bb, bb_agg
    gc.collect()
    # Bureau and bureau_balance numeric features
    num_aggregations = {
        'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
        'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
        'DAYS_CREDIT_UPDATE': ['mean'],
        'CREDIT_DAY_OVERDUE': ['max', 'mean'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
        'AMT_ANNUITY': ['max', 'mean'],
        'CNT_CREDIT_PROLONG': ['sum'],
        # The MONTHS_BALANCE_* columns come from the bb_agg join above.
        'MONTHS_BALANCE_MIN': ['min'],
        'MONTHS_BALANCE_MAX': ['max'],
        'MONTHS_BALANCE_SIZE': ['mean', 'sum']
    }
    # Bureau and bureau_balance categorical features
    cat_aggregations = {}
    for cat in bureau_cat:
        cat_aggregations[cat] = ['mean']
    for cat in bb_cat:
        # bb indicator columns were renamed '<col>_MEAN' per credit above.
        cat_aggregations[cat + "_MEAN"] = ['mean']
    bureau_agg = bureau.groupby('SK_ID_CURR').agg(
        {**num_aggregations, **cat_aggregations})
    bureau_agg.columns = pd.Index(
        ['BUREAU_' + e[0] + "_" + e[1].upper() for e in bureau_agg.columns.tolist()])
    # Bureau: Active credits - using only numerical aggregations
    active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
    active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)
    active_agg.columns = pd.Index(
        ['ACTIVE_' + e[0] + "_" + e[1].upper() for e in active_agg.columns.tolist()])
    bureau_agg = bureau_agg.join(active_agg, how='left', on='SK_ID_CURR')
    del active, active_agg
    gc.collect()
    # Bureau: Closed credits - using only numerical aggregations
    closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
    closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations)
    closed_agg.columns = pd.Index(
        ['CLOSED_' + e[0] + "_" + e[1].upper() for e in closed_agg.columns.tolist()])
    bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR')
    del closed, closed_agg, bureau
    gc.collect()
    return bureau_agg
# Build the aggregated bureau/bureau_balance features, then free the raw
# tables and downcast the result to shrink its memory footprint.
bureau_and_bb = fe_bureau_and_balance(bureau, bureau_balance)
del bureau, bureau_balance
gc.collect()
reduce_mem_usage(bureau_and_bb)
Memory usage of dataframe is 144.49 MB Memory usage after optimization is: 70.61 MB Decreased by 51.1%
# Load previous_application and keep only rows whose SK_ID_CURR exists in
# the main application table (orphan rows cannot be joined later).
previous_application = read_data(data_dir+'previous_application.csv')
initial_len = len(previous_application)
previous_application = previous_application[previous_application.SK_ID_CURR.isin(application.SK_ID_CURR)]
print('%s rows with SK_ID_CURR not appearing in Application have been removed' %(initial_len-len(previous_application)))
Shape of the dataframe :(1670214, 37) Memory usage of dataframe is 471.48 MB Memory usage after optimization is: 130.62 MB Decreased by 72.3% 256513 rows with SK_ID_CURR not appearing in Application have been removed
# EDA of the qualitative columns of previous_application: binary
# categoricals, dominant values (>90% of rows) and rare values (<1%).
print ("Liste des colonnes à valeurs catégorielles binaires")
print(list_binary_cat_cols(previous_application))
print ("Valeurs les plus fréquentes pour les variables qualitatives")
print(list_overfrequent_category_values(previous_application,0.9))
print ("Valeurs les moins fréquentes pour les variables qualitatives")
print(list_unfrequent_category_values(previous_application, 0.01))
Liste des colonnes à valeurs catégorielles binaires
['FLAG_LAST_APPL_PER_CONTRACT']
Valeurs les plus fréquentes pour les variables qualitatives
FLAG_LAST_APPL_PER_CONTRACT
{'FLAG_LAST_APPL_PER_CONTRACT': ['Y']}
Valeurs les moins fréquentes pour les variables qualitatives
NAME_CASH_LOAN_PURPOSE
NAME_PAYMENT_TYPE
CODE_REJECT_REASON
NAME_TYPE_SUITE
NAME_GOODS_CATEGORY
CHANNEL_TYPE
NAME_SELLER_INDUSTRY
PRODUCT_COMBINATION
{'NAME_CASH_LOAN_PURPOSE': ['Other', 'Urgent needs', 'Buying a used car', 'Building a house or an annex', 'Everyday expenses', 'Medicine', 'Payments on other loans', 'Education', 'Journey', 'Purchase of electronic equipment', 'Buying a new car', 'Wedding / gift / holiday', 'Buying a home', 'Car repairs', 'Furniture', 'Buying a holiday home / land', 'Business development', 'Gasification / water supply', 'Buying a garage', 'Hobby', 'Money for a third person', 'Refusal to name the goal'], 'NAME_PAYMENT_TYPE': ['Non-cash from your account', 'Cashless from the account of the employer'], 'CODE_REJECT_REASON': ['SCOFR', 'XNA', 'VERIF', 'SYSTEM'], 'NAME_TYPE_SUITE': ['Other_A', 'Group of people'], 'NAME_GOODS_CATEGORY': ['Auto Accessories', 'Jewelry', 'Homewares', 'Medical Supplies', 'Vehicles', 'Sport and Leisure', 'Other', 'Gardening', 'Office Appliances', 'Tourism', 'Medicine', 'Direct Sales', 'Fitness', 'Additional Service', 'Education', 'Weapon', 'Insurance', 'Animals', 'House Construction'], 'CHANNEL_TYPE': ['Channel of corporate sales', 'Car dealer'], 'NAME_SELLER_INDUSTRY': ['Auto technology', 'Jewelry', 'MLM partners', 'Tourism'], 'PRODUCT_COMBINATION': ['POS industry without interest', 'POS others without interest', nan]}
show_category_details(previous_application)
NAME_CONTRACT_TYPE Cash loans 0.443350 Consumer loans 0.442283 Revolving loans 0.114146 XNA 0.000221 Name: NAME_CONTRACT_TYPE, dtype: float64 WEEKDAY_APPR_PROCESS_START WEDNESDAY 0.152450 TUESDAY 0.152124 MONDAY 0.151736 FRIDAY 0.150932 THURSDAY 0.149257 SATURDAY 0.144415 SUNDAY 0.099086 Name: WEEKDAY_APPR_PROCESS_START, dtype: float64 FLAG_LAST_APPL_PER_CONTRACT Y 0.994826 N 0.005174 Name: FLAG_LAST_APPL_PER_CONTRACT, dtype: float64 NAME_CASH_LOAN_PURPOSE XAP 0.556650 XNA 0.401323 Repairs 0.014230 Other 0.009501 Urgent needs 0.005118 Buying a used car 0.001746 Building a house or an annex 0.001658 Everyday expenses 0.001452 Medicine 0.001323 Payments on other loans 0.001113 Education 0.000944 Journey 0.000737 Purchase of electronic equipment 0.000661 Buying a new car 0.000627 Wedding / gift / holiday 0.000567 Buying a home 0.000496 Car repairs 0.000489 Furniture 0.000467 Buying a holiday home / land 0.000328 Business development 0.000254 Gasification / water supply 0.000178 Buying a garage 0.000082 Hobby 0.000032 Money for a third person 0.000016 Refusal to name the goal 0.000009 Name: NAME_CASH_LOAN_PURPOSE, dtype: float64 NAME_CONTRACT_STATUS Approved 0.626794 Canceled 0.183519 Refused 0.173580 Unused offer 0.016107 Name: NAME_CONTRACT_STATUS, dtype: float64 NAME_PAYMENT_TYPE Cash through the bank 0.624389 XNA 0.369923 Non-cash from your account 0.005022 Cashless from the account of the employer 0.000666 Name: NAME_PAYMENT_TYPE, dtype: float64 CODE_REJECT_REASON XAP 0.810308 HC 0.103264 LIMIT 0.033793 SCO 0.023086 CLIENT 0.016107 SCOFR 0.007693 XNA 0.003097 VERIF 0.002178 SYSTEM 0.000475 Name: CODE_REJECT_REASON, dtype: float64 NAME_TYPE_SUITE NaN 0.491385 Unaccompanied 0.302493 Family 0.128655 Spouse, partner 0.040734 Children 0.019174 Other_B 0.010662 Other_A 0.005532 Group of people 0.001365 Name: NAME_TYPE_SUITE, dtype: float64 NAME_CLIENT_TYPE Repeater 0.734018 New 0.183589 Refreshed 0.081301 XNA 0.001091 Name: NAME_CLIENT_TYPE, dtype: float64 
NAME_GOODS_CATEGORY XNA 5.639163e-01 Mobile 1.370467e-01 Consumer Electronics 7.412105e-02 Computers 6.360822e-02 Audio/Video 6.048096e-02 Furniture 3.214187e-02 Photo / Cinema Equipment 1.535473e-02 Construction Materials 1.510645e-02 Clothing and Accessories 1.385724e-02 Auto Accessories 4.405458e-03 Jewelry 3.669800e-03 Homewares 3.048028e-03 Medical Supplies 2.291857e-03 Vehicles 2.018814e-03 Sport and Leisure 1.796702e-03 Other 1.653108e-03 Gardening 1.642497e-03 Office Appliances 1.404116e-03 Tourism 9.365488e-04 Medicine 9.209868e-04 Direct Sales 2.334298e-04 Fitness 1.174223e-04 Additional Service 7.710258e-05 Education 6.295532e-05 Weapon 4.597861e-05 Insurance 4.102706e-05 Animals 7.073632e-07 House Construction 0.000000e+00 Name: NAME_GOODS_CATEGORY, dtype: float64 NAME_PORTFOLIO POS 0.419084 Cash 0.277048 XNA 0.217311 Cards 0.086288 Cars 0.000269 Name: NAME_PORTFOLIO, dtype: float64 NAME_PRODUCT_TYPE XNA 0.636664 x-sell 0.272614 walk-in 0.090722 Name: NAME_PRODUCT_TYPE, dtype: float64 CHANNEL_TYPE Credit and cash offices 0.424660 Country-wide 0.299258 Stone 0.129807 Regional / Local 0.065411 Contact center 0.041657 AP+ (Cash loan) 0.035102 Channel of corporate sales 0.003817 Car dealer 0.000287 Name: CHANNEL_TYPE, dtype: float64 NAME_SELLER_INDUSTRY XNA 0.505992 Consumer electronics 0.241707 Connectivity 0.168600 Furniture 0.034637 Construction 0.017979 Clothing 0.014074 Industry 0.011719 Auto technology 0.002886 Jewelry 0.001490 MLM partners 0.000651 Tourism 0.000265 Name: NAME_SELLER_INDUSTRY, dtype: float64 NAME_YIELD_GROUP XNA 0.303599 middle 0.229873 high 0.216496 low_normal 0.194443 low_action 0.055590 Name: NAME_YIELD_GROUP, dtype: float64 PRODUCT_COMBINATION Cash 0.166302 POS household with interest 0.160192 POS mobile with interest 0.135000 Cash X-Sell: middle 0.084909 Cash X-Sell: low 0.078234 Card Street 0.066912 POS industry with interest 0.059070 POS household without interest 0.050590 Card X-Sell 0.047233 Cash Street: high 0.036570 Cash 
X-Sell: high 0.035935 Cash Street: middle 0.021174 Cash Street: low 0.020226 POS other with interest 0.014366 POS mobile without interest 0.014069 POS industry without interest 0.007513 POS others without interest 0.001482 NaN 0.000221 Name: PRODUCT_COMBINATION, dtype: float64
Fonction qui permet de "condenser" les variables qualitatives en réduisant le nombre de modes
def condense_category_values(previous_application):
    """Condense the qualitative variables by reducing their number of modes.

    Infrequent category values (identified during the EDA above) are merged
    into a single grouped label, column by column. The dataframe is modified
    in place and also returned.
    """
    # (column, values to merge, replacement label), applied in order.
    regroupings = [
        ("NAME_TYPE_SUITE",
         ['Family', 'Spouse, partner', 'Children'], 'Family'),
        ("NAME_TYPE_SUITE",
         ['Other_B', 'Other_A', 'Group of people'], 'Other_company'),
        ("NAME_GOODS_CATEGORY",
         ['Auto Accessories', 'Jewelry', 'Homewares', 'Medical Supplies',
          'Vehicles', 'Sport and Leisure', 'Other', 'Gardening',
          'Office Appliances', 'Tourism', 'Medicine', 'Direct Sales',
          'Fitness', 'Additional Service', 'Education', 'Weapon',
          'Insurance', 'Animals', 'House Construction'], 'Other'),
        ("NAME_CASH_LOAN_PURPOSE",
         ['Buying a used car', 'Building a house or an annex',
          'Everyday expenses', 'Medicine', 'Payments on other loans',
          'Education', 'Journey', 'Purchase of electronic equipment',
          'Buying a new car', 'Wedding / gift / holiday', 'Buying a home',
          'Car repairs', 'Furniture', 'Buying a holiday home / land',
          'Business development', 'Gasification / water supply',
          'Buying a garage', 'Hobby', 'Money for a third person',
          'Refusal to name the goal'], 'Other'),
        ("CODE_REJECT_REASON",
         ['XNA', 'VERIF', 'SYSTEM'], 'XNA_VERIF_SYSTEM'),
        ("CHANNEL_TYPE",
         ['Channel of corporate sales', 'Car dealer'], 'Other_Channel'),
        ("NAME_SELLER_INDUSTRY",
         ['Auto technology', 'Jewelry', 'MLM partners', 'Tourism'],
         'Other_Industry'),
    ]
    for column, values, label in regroupings:
        previous_application[column] = previous_application[column].replace(values, label)
    return previous_application
def fe_previous_application(prev, nan_as_category=True):
    """Aggregate previous_application rows into one line per SK_ID_CURR.

    Condenses rare category values, one-hot encodes the qualitative
    columns, cleans the placeholder dates, then computes numeric and
    categorical aggregations overall and for the approved / refused
    subsets.

    Parameters
    ----------
    prev : pd.DataFrame
        Raw previous_application table.
    nan_as_category : bool
        Forwarded to one_hot_encoder (adds *_nan indicator columns).

    Returns
    -------
    pd.DataFrame indexed by SK_ID_CURR with PREV_*, APPROVED_* and
    REFUSED_* aggregated features.
    """
    prev = condense_category_values(prev)
    # Columns judged uninformative for this model
    prev.drop(columns=['FLAG_LAST_APPL_PER_CONTRACT','WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START'], inplace=True)
    # Encode the two binary flags as integer codes
    for bin_feature in ['NFLAG_INSURED_ON_APPROVAL', 'NFLAG_LAST_APPL_IN_DAY']:
        prev[bin_feature], _ = pd.factorize(prev[bin_feature])
    prev, cat_cols = one_hot_encoder(prev, nan_as_category)
    # 365243 is the dataset's placeholder for "no date": treat it as missing
    # (one vectorized replace over the subset instead of five per-column calls)
    day_cols = ['DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
                'DAYS_LAST_DUE', 'DAYS_TERMINATION']
    prev[day_cols] = prev[day_cols].replace(365243, np.nan)
    # Add feature: value asked / value received percentage
    # (may be inf when AMT_CREDIT == 0; infinities are zeroed downstream)
    prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']
    # Previous applications numeric features
    num_aggregations = {
        'AMT_ANNUITY': ['min', 'max', 'mean'],
        'AMT_APPLICATION': ['min', 'max', 'mean'],
        'AMT_CREDIT': ['min', 'max', 'mean'],
        'APP_CREDIT_PERC': ['min', 'max', 'mean', 'var'],
        'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'AMT_GOODS_PRICE': ['sum', 'mean'],
        'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'DAYS_DECISION': ['min', 'max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
    }
    # Previous applications categorical features (share of each one-hot column)
    cat_aggregations = {cat: ['mean'] for cat in cat_cols}
    prev_agg = prev.groupby('SK_ID_CURR').agg(
        {**num_aggregations, **cat_aggregations})
    prev_agg.columns = pd.Index(
        ['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])
    # Previous Applications: Approved Applications - only numerical features
    approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1]
    approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
    approved_agg.columns = pd.Index(
        ['APPROVED_' + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()])
    prev_agg = prev_agg.join(approved_agg, how='left', on='SK_ID_CURR')
    # Previous Applications: Refused Applications - only numerical features
    refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1]
    refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
    refused_agg.columns = pd.Index(
        ['REFUSED_' + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()])
    prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')
    del refused, refused_agg, approved, approved_agg, prev
    gc.collect()
    return prev_agg
# Aggregate previous applications per client, then downcast dtypes.
previous_application = fe_previous_application(previous_application)
reduce_mem_usage(previous_application)
Memory usage of dataframe is 312.55 MB Memory usage after optimization is: 127.68 MB Decreased by 59.1%
# Load credit_card_balance and keep only clients present in application.
credit_card_balance = read_data(data_dir+'credit_card_balance.csv')
initial_len = len(credit_card_balance)
credit_card_balance = credit_card_balance[credit_card_balance.SK_ID_CURR.isin(application.SK_ID_CURR)]
print('%s rows have been removed' %(initial_len-len(credit_card_balance)))
Shape of the dataframe :(3840312, 23) Memory usage of dataframe is 673.88 MB Memory usage after optimization is: 263.69 MB Decreased by 60.9% 612347 rows have been removed
eda.df_info(credit_card_balance)
| data_type | data_type_grp | num_unique_values | sample_unique_values | num_missing | perc_missing | |
|---|---|---|---|---|---|---|
| SK_ID_PREV | int32 | Numerical | 87452 | [2582071, 1389973, 1891521, 2181852, 1235299, ... | 0 | 0.0 |
| SK_ID_CURR | int32 | Numerical | 86905 | [363914, 337855, 126868, 367360, 203885, 34033... | 0 | 0.0 |
| MONTHS_BALANCE | int8 | Numerical | 96 | [-1, -4, -5, -3, -2, -19, -13, -18, -15, -12] | 0 | 0.0 |
| AMT_BALANCE | float32 | Numerical | 1182264 | [63975.5546875, 236572.109375, 453919.46875, 2... | 0 | 0.0 |
| AMT_CREDIT_LIMIT_ACTUAL | int32 | Numerical | 167 | [45000, 225000, 450000, 292500, 135000, 270000... | 0 | 0.0 |
| AMT_DRAWINGS_ATM_CURRENT | float32 | Numerical | 2095 | [2250.0, 0.0, 90000.0, 76500.0, 10800.0, 4500.... | 605754 | 18.765817 |
| AMT_DRAWINGS_CURRENT | float32 | Numerical | 159227 | [2250.0, 11547.0, 289339.4375, 111026.703125, ... | 0 | 0.0 |
| AMT_DRAWINGS_OTHER_CURRENT | float32 | Numerical | 1650 | [0.0, 137700.0, nan, 46800.0, 187200.0, 22950.... | 605754 | 18.765817 |
| AMT_DRAWINGS_POS_CURRENT | float32 | Numerical | 143768 | [0.0, 11547.0, 199339.421875, 34526.69921875, ... | 605754 | 18.765817 |
| AMT_INST_MIN_REGULARITY | float32 | Numerical | 278880 | [2250.0, 11795.759765625, 22924.890625, 130.5,... | 264384 | 8.190423 |
| AMT_PAYMENT_CURRENT | float32 | Numerical | 142261 | [2250.0, 11925.0, 27000.0, 4093.514892578125, ... | 620093 | 19.210029 |
| AMT_PAYMENT_TOTAL_CURRENT | float32 | Numerical | 156928 | [2250.0, 11925.0, 27000.0, 4093.514892578125, ... | 0 | 0.0 |
| AMT_RECEIVABLE_PRINCIPAL | float32 | Numerical | 1045413 | [60175.078125, 224949.28125, 443044.40625, 285... | 0 | 0.0 |
| AMT_RECIVABLE | float32 | Numerical | 1174181 | [64875.5546875, 233048.96875, 453919.46875, 28... | 0 | 0.0 |
| AMT_TOTAL_RECEIVABLE | float32 | Numerical | 1174292 | [64875.5546875, 233048.96875, 453919.46875, 28... | 0 | 0.0 |
| CNT_DRAWINGS_ATM_CURRENT | float16 | Numerical | 44 | [1.0, 0.0, 3.0, 2.0, 5.0, nan, 6.0, 4.0, 8.0, ... | 605754 | 18.765817 |
| CNT_DRAWINGS_CURRENT | int16 | Numerical | 127 | [1, 8, 9, 2, 30, 0, 4, 7, 3, 5] | 0 | 0.0 |
| CNT_DRAWINGS_OTHER_CURRENT | float16 | Numerical | 11 | [0.0, 1.0, nan, 3.0, 2.0, 4.0, 6.0, 5.0, 7.0, ... | 605754 | 18.765817 |
| CNT_DRAWINGS_POS_CURRENT | float16 | Numerical | 132 | [0.0, 1.0, 5.0, 6.0, 29.0, 4.0, 8.0, nan, 14.0... | 605754 | 18.765817 |
| CNT_INSTALMENT_MATURE_CUM | float16 | Numerical | 121 | [69.0, 10.0, 101.0, 3.0, 38.0, 6.0, 27.0, 2.0,... | 264384 | 8.190423 |
| NAME_CONTRACT_STATUS | category | category | 7 | [Active, Completed, Signed, Demand, Sent propo... | 0 | 0.0 |
| SK_DPD | int16 | Numerical | 844 | [0, 2192, 7, 717, 28, 18, 1676, 1311, 1158, 1462] | 0 | 0.0 |
| SK_DPD_DEF | int16 | Numerical | 308 | [0, 7, 1, 701, 8, 5, 63, 31, 62, 122] | 0 | 0.0 |
# Dominant (>90%) and rare (<0.1%) category values of credit_card_balance.
print ("Valeurs les plus fréquentes pour les variables qualitatives")
print(list_overfrequent_category_values(credit_card_balance,0.9))
print ("Valeurs les moins fréquentes pour les variables qualitatives")
list_unfrequent_category_values(credit_card_balance,0.001)
Valeurs les plus fréquentes pour les variables qualitatives
NAME_CONTRACT_STATUS
{'NAME_CONTRACT_STATUS': ['Active']}
Valeurs les moins fréquentes pour les variables qualitatives
NAME_CONTRACT_STATUS
{'NAME_CONTRACT_STATUS': ['Demand', 'Sent proposal', 'Refused', 'Approved']}
def fe_credit_card_balance(cc, nan_as_category=True):
    """Aggregate credit_card_balance rows into one line per SK_ID_CURR.

    Rare contract statuses are merged into a single level, qualitative
    columns are one-hot encoded, and every remaining column is summarized
    with min/max/mean/sum/var per client.
    """
    # Merge the rare statuses observed in the EDA into one level.
    rare_statuses = ['Signed','Demand','Sent proposal','Approved']
    cc["NAME_CONTRACT_STATUS"] = cc["NAME_CONTRACT_STATUS"].replace(rare_statuses, 'Unfrequent')
    cc, cat_cols = one_hot_encoder(cc, nan_as_category)
    # Express MONTHS_BALANCE as a positive month count.
    cc['MONTHS_BALANCE'] = -cc['MONTHS_BALANCE']
    # The previous-application id carries no information after grouping.
    cc.drop(['SK_ID_PREV'], axis=1, inplace=True)
    # General aggregations: every statistic for every remaining column.
    cc_agg = cc.groupby('SK_ID_CURR').agg(['min', 'max', 'mean', 'sum', 'var'])
    flat_names = ['CC_' + column + "_" + stat.upper() for column, stat in cc_agg.columns]
    cc_agg.columns = pd.Index(flat_names)
    # Number of monthly credit-card records per client.
    cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()
    del cc
    gc.collect()
    return cc_agg
# Aggregate credit card history per client, then downcast dtypes.
credit_card_balance = fe_credit_card_balance(credit_card_balance)
reduce_mem_usage(credit_card_balance)
Memory usage of dataframe is 43.18 MB Memory usage after optimization is: 31.41 MB Decreased by 27.3%
# Load POS_CASH_balance and keep only clients present in application.
POS_CASH_balance = read_data(data_dir+'POS_CASH_balance.csv')
initial_len = len(POS_CASH_balance)
POS_CASH_balance = POS_CASH_balance[POS_CASH_balance.SK_ID_CURR.isin(application.SK_ID_CURR)]
print('%s rows have been removed' %(initial_len-len(POS_CASH_balance)))
Shape of the dataframe :(10001358, 8) Memory usage of dataframe is 610.43 MB Memory usage after optimization is: 171.69 MB Decreased by 71.9% 1457983 rows have been removed
eda.df_info(POS_CASH_balance)
| data_type | data_type_grp | num_unique_values | sample_unique_values | num_missing | perc_missing | |
|---|---|---|---|---|---|---|
| SK_ID_PREV | int32 | Numerical | 800337 | [1803195, 1715348, 1784872, 1903291, 2341044, ... | 0 | 0.0 |
| SK_ID_CURR | int32 | Numerical | 289444 | [182943, 367990, 397406, 269225, 334279, 34216... | 0 | 0.0 |
| MONTHS_BALANCE | int8 | Numerical | 96 | [-31, -33, -32, -35, -38, -39, -34, -37, -41, ... | 0 | 0.0 |
| CNT_INSTALMENT | float16 | Numerical | 72 | [48.0, 36.0, 12.0, 24.0, 60.0, 18.0, 4.0, 25.0... | 21863 | 0.255906 |
| CNT_INSTALMENT_FUTURE | float16 | Numerical | 78 | [45.0, 35.0, 9.0, 42.0, 12.0, 43.0, 36.0, 16.0... | 21878 | 0.256081 |
| NAME_CONTRACT_STATUS | category | category | 9 | [Active, Signed, Completed, Approved, Returned... | 0 | 0.0 |
| SK_DPD | int16 | Numerical | 3358 | [0, 1, 2, 4, 3, 18, 7, 5, 12, 6] | 0 | 0.0 |
| SK_DPD_DEF | int16 | Numerical | 1987 | [0, 1, 2, 4, 3, 18, 7, 5, 12, 8] | 0 | 0.0 |
# Dominant (>90%) and rare (<0.1%) category values of POS_CASH_balance.
print ("Valeurs les plus fréquentes pour les variables qualitatives")
print(list_overfrequent_category_values(POS_CASH_balance,0.9))
print ("Valeurs les moins fréquentes pour les variables qualitatives")
print(list_unfrequent_category_values(POS_CASH_balance,0.001))
Valeurs les plus fréquentes pour les variables qualitatives
NAME_CONTRACT_STATUS
{'NAME_CONTRACT_STATUS': ['Active']}
Valeurs les moins fréquentes pour les variables qualitatives
NAME_CONTRACT_STATUS
{'NAME_CONTRACT_STATUS': ['Demand', 'Returned to the store', 'Approved', 'Amortized debt', 'Canceled', 'XNA']}
def fe_pos_cash(pos, nan_as_category=True):
    """Aggregate POS_CASH_balance rows into one line per SK_ID_CURR.

    Rare contract statuses are merged into a single 'Unfrequent' level
    before one-hot encoding, then balance and days-past-due statistics
    are computed per client.

    Parameters
    ----------
    pos : pd.DataFrame
        Raw POS_CASH_balance table.
    nan_as_category : bool
        Forwarded to one_hot_encoder (adds *_nan indicator columns).

    Returns
    -------
    pd.DataFrame indexed by SK_ID_CURR with POS_* features and a
    POS_COUNT record count.
    """
    # Merge the rare statuses observed in the EDA into one level
    a = ['Demand', 'Returned to the store', 'Approved', 'Amortized debt', 'Canceled', 'XNA']
    pos['NAME_CONTRACT_STATUS']= pos['NAME_CONTRACT_STATUS'].replace(a, 'Unfrequent')
    # Fix: forward the caller's nan_as_category instead of hard-coding True,
    # so the parameter in the signature is actually honoured.
    pos, cat_cols = one_hot_encoder(pos, nan_as_category)
    # Features
    aggregations = {
        'MONTHS_BALANCE': ['max', 'mean', 'size'],
        'SK_DPD': ['max', 'mean'],
        'SK_DPD_DEF': ['max', 'mean']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']
    pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
    pos_agg.columns = pd.Index(
        ['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])
    # Count pos cash accounts
    pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()
    del pos
    gc.collect()
    return pos_agg
# Aggregate POS/cash history per client, then downcast dtypes.
POS_CASH_balance = fe_pos_cash(POS_CASH_balance)
reduce_mem_usage(POS_CASH_balance)
Memory usage of dataframe is 25.67 MB Memory usage after optimization is: 9.11 MB Decreased by 64.5%
# Load installments_payments and keep only clients present in application.
installments_payments = read_data(data_dir+'installments_payments.csv')
initial_len = len(installments_payments)
installments_payments = installments_payments[installments_payments.SK_ID_CURR.isin(application.SK_ID_CURR)]
print('%s rows have been removed' %(initial_len-len(installments_payments)))
Shape of the dataframe :(13605401, 8) Memory usage of dataframe is 830.41 MB Memory usage after optimization is: 311.40 MB Decreased by 62.5% 2013809 rows have been removed
eda.df_info(installments_payments)
| data_type | data_type_grp | num_unique_values | sample_unique_values | num_missing | perc_missing | |
|---|---|---|---|---|---|---|
| SK_ID_PREV | int32 | Numerical | 853344 | [1054186, 1330831, 2085231, 2452527, 2714724, ... | 0 | 0.0 |
| SK_ID_CURR | int32 | Numerical | 291643 | [161674, 151639, 193053, 199697, 167756, 16448... | 0 | 0.0 |
| NUM_INSTALMENT_VERSION | float16 | Numerical | 55 | [1.0, 0.0, 2.0, 4.0, 3.0, 5.0, 7.0, 8.0, 6.0, ... | 0 | 0.0 |
| NUM_INSTALMENT_NUMBER | int16 | Numerical | 277 | [6, 34, 1, 3, 2, 12, 11, 4, 14, 8] | 0 | 0.0 |
| DAYS_INSTALMENT | float16 | Numerical | 2485 | [-1180.0, -2156.0, -63.0, -2418.0, -1383.0, -1... | 0 | 0.0 |
| DAYS_ENTRY_PAYMENT | float16 | Numerical | 2554 | [-1187.0, -2156.0, -63.0, -2426.0, -1366.0, -1... | 2583 | 0.022283 |
| AMT_INSTALMENT | float32 | Numerical | 831095 | [6948.35986328125, 1716.5250244140625, 25425.0... | 0 | 0.0 |
| AMT_PAYMENT | float32 | Numerical | 875277 | [6948.35986328125, 1716.5250244140625, 25425.0... | 2583 | 0.022283 |
def fe_install_payments(ins, nan_as_category=True):
    """Aggregate installments_payments rows into one line per SK_ID_CURR.

    Derives payment-versus-installment ratios/differences and days
    past/before due, then aggregates them per client.

    Parameters
    ----------
    ins : pd.DataFrame
        Raw installments_payments table.
    nan_as_category : bool
        Forwarded to one_hot_encoder (adds *_nan indicator columns).

    Returns
    -------
    pd.DataFrame indexed by SK_ID_CURR with INSTAL_* features and an
    INSTAL_COUNT record count.
    """
    ins, cat_cols = one_hot_encoder(ins, nan_as_category)
    # Percentage and difference paid in each installment (amount paid vs due).
    # PAYMENT_PERC may be inf when AMT_INSTALMENT == 0; infinities are zeroed downstream.
    ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']
    # Days past due and days before due, floored at 0. Vectorized `.where`
    # replaces the former row-wise apply; NaN deltas become 0 exactly as the
    # original lambda did (NaN > 0 is False).
    dpd = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
    dbd = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
    ins['DPD'] = dpd.where(dpd > 0, 0)
    ins['DBD'] = dbd.where(dbd > 0, 0)
    # Features: Perform aggregations
    aggregations = {
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DPD': ['max', 'mean', 'sum'],
        'DBD': ['max', 'mean', 'sum'],
        'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'],
        'PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
        'AMT_INSTALMENT': ['max', 'mean', 'sum'],
        'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
        'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']
    ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
    ins_agg.columns = pd.Index(
        ['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
    # Count installments accounts
    ins_agg['INSTAL_COUNT'] = ins.groupby('SK_ID_CURR').size()
    del ins
    gc.collect()
    return ins_agg
# Aggregate installment history per client, then downcast dtypes.
installments_payments = fe_install_payments(installments_payments)
reduce_mem_usage(installments_payments)
Memory usage of dataframe is 39.49 MB Memory usage after optimization is: 26.42 MB Decreased by 33.1%
# Left-join every aggregated table onto the main application frame on
# SK_ID_CURR, freeing each source table right after it is merged to keep
# peak memory low. `join` is used where the right-hand table is indexed
# by SK_ID_CURR.
print("shape:", application.shape)
with timer("Merge Application with bureau and bureau_balance"):
    df = application.merge(bureau_and_bb, how='left', on='SK_ID_CURR')
    print("shape:", application.shape)
    del application
    gc.collect()
with timer("Merge with previous_application"):
    df = df.merge(previous_application, how='left', on='SK_ID_CURR')
    del previous_application
    print("shape:", df.shape)
    gc.collect()
with timer("Merge with POS-CASH balance"):
    df = df.join(POS_CASH_balance, how='left', on='SK_ID_CURR')
    del POS_CASH_balance
    print("shape:", df.shape)
    gc.collect()
with timer("Merge with installments payments"):
    df = df.join(installments_payments, how='left', on='SK_ID_CURR')
    del installments_payments
    print("shape:", df.shape)
    gc.collect()
with timer("Merge with credit card balance"):
    df = df.join(credit_card_balance, how='left', on='SK_ID_CURR')
    del credit_card_balance
    print("shape:", df.shape)
    gc.collect()
shape: (307511, 124) shape: (307511, 124) Merge Application with bureau and bureau_balance - done in 2s shape: (307511, 400) Merge with previous_application - done in 2s shape: (307511, 413) Merge with POS-CASH balance - done in 1s shape: (307511, 439) Merge with installments payments - done in 1s shape: (307511, 565) Merge with credit card balance - done in 1s
# List the columns with more than 60% missing values and drop them.
X = df.drop(columns = ['SK_ID_CURR','TARGET'])
emptiest_columns = list_emptiest_columns(X, 0.6)
df = df.drop(columns=emptiest_columns)
print("Suppression de %s colonnes" %len(emptiest_columns))
Suppression de 173 colonnes
A ce stade, je préfère imputer les valeurs manquantes par zéro. En effet, l'encodage des variables qualitatives a déjà été fait, avec des colonnes _nan. Une valeur manquante signifie l'inexistence totale de l'individu, à savoir ni une précédente valeur vide, ni non-vide d'où la valeur zéro, de même pour les variables numériques.
# Replace empty or infinite values with 0. The categorical variables were
# already one-hot encoded (with *_nan columns), so a missing value here
# means the record does not exist at all — hence zero, for the numeric
# aggregates as well.
df.replace([np.inf, -np.inf], 0, inplace=True)
df = df.fillna(0)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 307511 entries, 0 to 307510 Columns: 392 entries, SK_ID_CURR to INSTAL_COUNT dtypes: float16(296), float32(81), float64(15) memory usage: 306.2 MB
df.to_pickle('../../gen_data/data_before_feature_selection.pkl')
Dans le cadre de la lutte contre le fléau dimensionnel, je sélectionne 100 colonnes les plus importantes avec SelectKbest.
# Fight the curse of dimensionality: keep the 100 columns most related to
# TARGET according to a univariate ANOVA F-test (SelectKBest / f_classif).
to_ignore = ['SK_ID_CURR','TARGET']
all_columns = df.columns
print("Before ")
print(df.info())
features_to_check = [feature for feature in all_columns if feature not in to_ignore]
X = df[features_to_check]
y = df[['TARGET']]
selector = SelectKBest(f_classif, k = 100).fit(X,y)
k_best_features = selector.get_feature_names_out()
to_keep = list(['SK_ID_CURR','TARGET']) + list(k_best_features)
df = df[to_keep]
print("After ")
print(df.info())
Before <class 'pandas.core.frame.DataFrame'> Int64Index: 307511 entries, 0 to 307510 Columns: 392 entries, SK_ID_CURR to INSTAL_COUNT dtypes: float16(296), float32(81), float64(15) memory usage: 306.2 MB None After <class 'pandas.core.frame.DataFrame'> Int64Index: 307511 entries, 0 to 307510 Columns: 102 entries, SK_ID_CURR to INSTAL_DAYS_ENTRY_PAYMENT_SUM dtypes: float16(86), float32(11), float64(5) memory usage: 77.4 MB None
Sauvegarde des données qui seront utilisées pour l'apprentissage
df.to_pickle('../../gen_data/data_to_train.pkl')
Fonction de condenser les données par feature via binning des valeurs et comptage. Ces données seront utilisées pour comparer le client courant aux autres clients de la base, par feature.
def build_binned_data(data, num_bins):
    """Bin every feature and count rows per (feature, bin, TARGET).

    Each feature column (everything except SK_ID_CURR and TARGET) is cut
    into num_bins equal-width bins; counts are expressed both absolutely
    and as a percentage of each TARGET class, so a given client can later
    be compared to the rest of the population feature by feature.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain SK_ID_CURR and TARGET plus numeric feature columns.
    num_bins : int
        Number of equal-width bins per feature.

    Returns
    -------
    pd.DataFrame with columns feature, bin, TARGET, count and
    percent_of_target, bins sorted ascending within each feature.
    """
    features = data.drop(columns=['SK_ID_CURR','TARGET']).columns
    grouped_and_binned_data_df = pd.DataFrame([], columns=['feature', 'bin','TARGET','count', 'percent_of_target'])
    for col in features :
        # .copy() so the helper columns added below never touch the caller's frame
        target_data = data[[col,'TARGET']].copy()
        target_data['feature'] = col
        target_data['bin'] = pd.cut(target_data[col], num_bins)
        grouped_data = pd.DataFrame(target_data[['feature','bin', 'TARGET']].value_counts())
        # Careful: bins must be ordered for the graphical representation!
        grouped_data = grouped_data.reset_index().sort_values(by='bin', ascending=True).rename(columns={0: 'count'})
        # Add the percentages relative to the TARGET 0 and TARGET 1 totals
        count1 = grouped_data[grouped_data.TARGET == 1]['count'].sum()
        count0 = grouped_data[grouped_data.TARGET == 0]['count'].sum()
        grouped_data.loc[grouped_data['TARGET'] == 1, 'percent_of_target'] = (grouped_data['count']/count1)*100
        grouped_data.loc[grouped_data['TARGET'] == 0, 'percent_of_target'] = (grouped_data['count']/count0)*100
        grouped_and_binned_data_df = pd.concat([grouped_and_binned_data_df, grouped_data])
        del target_data, grouped_data, count0, count1
        gc.collect()
    return grouped_and_binned_data_df
Sauvegarde des données "condensées"
# Strip characters that downstream tools reject from the column names,
# then build and save the per-feature binned distributions.
import re
df = df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
binned_data_df = build_binned_data(df, num_bins =10)
binned_data_df.to_pickle('../../gen_data/binned_data.pkl')
binned_data_df
| feature | bin | TARGET | count | percent_of_target | |
|---|---|---|---|---|---|
| 0 | AMT_CREDIT | (40995.0, 445500.0] | 0.0 | 114825 | 40.619274 |
| 5 | AMT_CREDIT | (40995.0, 445500.0] | 1.0 | 10284 | 41.425982 |
| 1 | AMT_CREDIT | (445500.0, 846000.0] | 0.0 | 102444 | 36.239502 |
| 4 | AMT_CREDIT | (445500.0, 846000.0] | 1.0 | 10285 | 41.430010 |
| 2 | AMT_CREDIT | (846000.0, 1246500.0] | 0.0 | 41676 | 14.742860 |
| ... | ... | ... | ... | ... | ... |
| 7 | INSTAL_DAYS_ENTRY_PAYMENT_SUM | (-180746.7, -120497.8] | 1.0 | 881 | 3.548842 |
| 6 | INSTAL_DAYS_ENTRY_PAYMENT_SUM | (-120497.8, -60248.9] | 1.0 | 2129 | 8.576032 |
| 1 | INSTAL_DAYS_ENTRY_PAYMENT_SUM | (-120497.8, -60248.9] | 0.0 | 29717 | 10.512371 |
| 2 | INSTAL_DAYS_ENTRY_PAYMENT_SUM | (-60248.9, 0.0] | 1.0 | 21000 | 84.592145 |
| 0 | INSTAL_DAYS_ENTRY_PAYMENT_SUM | (-60248.9, 0.0] | 0.0 | 227637 | 80.526450 |
1542 rows × 5 columns
! jupyter nbconvert --to html data_cleaning_and_merge.ipynb